#R libraries
library(knitr)
library(yaml)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
require(graphics)
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## v purrr 0.3.4
## -- Conflicts -------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggthemes)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(stringr)
library(stringi)
library(mapproj)
library(RCurl)
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
library(readr)
library(rio)
##
## Attaching package: 'rio'
## The following object is masked from 'package:plotly':
##
## export
library(naniar)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(grid)
#Beer and Breweries data input from CSV files
# This reads in the Beers data from select folder file Beers.csv.
Beerdata <- read.csv("https://raw.githubusercontent.com/VenkataVanga/MSDS-6306/main/Beers.csv",
header = T,sep = ",",na.strings = "NA",fill = TRUE)
head(Beerdata)
## Name Beer_ID ABV IBU Brewery_id
## 1 Pub Beer 1436 0.050 NA 409
## 2 Devil's Cup 2265 0.066 NA 178
## 3 Rise of the Phoenix 2264 0.071 NA 178
## 4 Sinister 2263 0.090 NA 178
## 5 Sex and Candy 2262 0.075 NA 178
## 6 Black Exodus 2261 0.077 NA 178
## Style Ounces
## 1 American Pale Lager 12
## 2 American Pale Ale (APA) 12
## 3 American IPA 12
## 4 American Double / Imperial IPA 12
## 5 American IPA 12
## 6 Oatmeal Stout 12
#Number of rows in Beer data
nrow(Beerdata)
## [1] 2410
# This reads in the Beers data from select folder file Breweries.csv.
Breweriesdata <- read.csv("https://raw.githubusercontent.com/VenkataVanga/MSDS-6306/main/Breweries.csv",
header = T,sep = ",",na.strings = "NA", fill = TRUE)
head(Breweriesdata)
## Brew_ID Name City State
## 1 1 NorthGate Brewing Minneapolis MN
## 2 2 Against the Grain Brewery Louisville KY
## 3 3 Jack's Abby Craft Lagers Framingham MA
## 4 4 Mike Hess Brewing Company San Diego CA
## 5 5 Fort Point Beer Company San Francisco CA
## 6 6 COAST Brewing Company Charleston SC
#Number of rows in Breweries data
nrow(Breweriesdata)
## [1] 558
#Removing not applicable ABV values
Beerdata1 <- Beerdata %>% filter(Beerdata$ABV != "NA")
Beerdata1$Ounces <- as.factor(Beerdata1$Ounces)
#Summary count of number of ABV values by categorized Ounces of beer
s <- Beerdata1 %>% group_by(Ounces)%>% summarize(ABV,count=n())
## `summarise()` regrouping output by 'Ounces' (override with `.groups` argument)
# Bar plot of ABV v. Ounces to see how much data is available
Beerdata1 %>% ggplot(aes(x = Ounces, y = s$count[1], fill=Ounces)) + geom_col() +
ggtitle("Alcohol By Volume (ABV) data count by Ounce Category") +
labs(y="Alcohol By Volume (ABV)") +
geom_text(aes(Ounces, s$count+100, label = s$count, fill = NULL), data = s)
#Generating geographic plots from breweries data (following chunk generate the final plot)
#makes a data frame with State name and abbreviation.
lookup = data.frame(abb = state.abb, State = state.name)
dc <- c("DC", "District of Columbia")
lookup <- rbind(lookup,dc)
# Change Column Name State to abb (abbreviation)
colnames(Breweriesdata)[4] = "abb"
# Removes left space in state abb data taken from Breweries CSV
Breweriesdata$abb <- trimws(Breweriesdata$abb,which = c("left"))
# make one dataset with state names and abb
Brewdata <- merge(Breweriesdata,lookup,"abb")
Brewcount <- count(Brewdata,State,abb) #count up the occurrence of each state
colnames(Brewcount)[3] = "Breweries_Count" #change "n" to "Breweries_Count"
# added state name also changed to lower case
Brewcount$region <- tolower(Brewcount$State)
Brewcount2 = Brewcount[-1] #removed first column from Brew count data state name
states <- map_data("state") #contains data for states excluding Alaska, Hawaii
#Added Alaska to states as the states data does not include Alaska (Hawaii not in data so did not include in below data)
Alaska <- read.csv("https://raw.githubusercontent.com/VenkataVanga/MSDS-6306/main/Alaska.csv",header = T,sep = ",",na.strings = "NA",
fill = TRUE);
states <- rbind(states,Alaska)
#map.df is data frame containing longitude and latitude to form state and breweries count by state
map.df <- merge(states,Brewcount2, by="region", all.x=T)
map.df <- map.df[order(map.df$order),]
#Breweries count by state using gradient graphics
plot <- map.df %>% ggplot(aes(x=long, y=lat, group = group)) +
geom_polygon(aes(fill = Breweries_Count)) + geom_path() +
scale_fill_gradientn(colours=rev(topo.colors(10)),na.value="grey90",
breaks=c(0,5,10,15,20,25,30,35,40,45,50))+
ggtitle("Breweries Count by State") + coord_map()
#state center longitude and latitude (read data from csv file)
st_center <- read.csv("https://raw.githubusercontent.com/VenkataVanga/MSDS-6306/main/long_lat_statecenter.csv",header = T,sep = ",",na.strings = "NA",
fill = TRUE);
# state names changed to lower case
st_center$region <- tolower(st_center$region)
#map.df1 merges state center with Breweries count data
map.df1 <- merge(st_center,Brewcount2, by="region", all.x = T)
# Adding state names and Breweries count data by state
Nplot <- plot + geom_point(data=map.df1, aes(x=long, y=lat,
size = Breweries_Count, group = region),
show.legend = FALSE) +
geom_text(aes( x= long, y = lat,label = abb.x, group = region),
data = map.df1, vjust=-1.2) +
geom_text(aes(x = long, y = lat, label = Breweries_Count, group = region),
data = map.df1,vjust=1.8)
#Code below adds scaled legend to the Breweries count by state
cplot <- Nplot + theme(legend.background = element_rect(fill="gray90",
size=10, linetype="dotted"),
legend.key.height = unit(4,"lines")) +
theme(plot.title = element_text(size=14, face= "bold", colour= "black"))
#Final plot with state names and Breweries count by state
cplot #plot best viewed in scaled full screen zoom 1920px x 1080px (21" or higher dim screen)
The final plot shows that ‘Colorado’ has the highest (47)number of breweries followed by ‘California (39)’, ‘Michigan(32)’ and ‘Oregon(29)’ being the next three successors.
#Merging beer data and breweries data
colnames(Beerdata)[5] = "Brew_ID" #Changing columun name to combine both beer data and breweries data
Totaldata <- merge(Beerdata,Breweriesdata, by="Brew_ID", all.x = T)
colnames(Totaldata)[2] = "Beer_Name" #Changing the column name to Beer name
colnames(Totaldata)[8] = "Brewery_Name" # Changing the column name to Brewery name
colnames(Totaldata)[10] = "State" # Changing the column name back to State
#First 6 observations after merging beer data and breweries data
head(Totaldata,6)
## Brew_ID Beer_Name Beer_ID ABV IBU Style
## 1 1 Get Together 2692 0.045 50 American IPA
## 2 1 Maggie's Leap 2691 0.049 26 Milk / Sweet Stout
## 3 1 Wall's End 2690 0.048 19 English Brown Ale
## 4 1 Pumpion 2689 0.060 38 Pumpkin Ale
## 5 1 Stronghold 2688 0.060 25 American Porter
## 6 1 Parapet ESB 2687 0.056 47 Extra Special / Strong Bitter (ESB)
## Ounces Brewery_Name City State
## 1 16 NorthGate Brewing Minneapolis MN
## 2 16 NorthGate Brewing Minneapolis MN
## 3 16 NorthGate Brewing Minneapolis MN
## 4 16 NorthGate Brewing Minneapolis MN
## 5 16 NorthGate Brewing Minneapolis MN
## 6 16 NorthGate Brewing Minneapolis MN
#Last 6 observations after merging beer data and breweries data
tail(Totaldata,6)
## Brew_ID Beer_Name Beer_ID ABV IBU
## 2405 556 Pilsner Ukiah 98 0.055 NA
## 2406 557 Heinnieweisse Weissebier 52 0.049 NA
## 2407 557 Snapperhead IPA 51 0.068 NA
## 2408 557 Moo Thunder Stout 50 0.049 NA
## 2409 557 Porkslap Pale Ale 49 0.043 NA
## 2410 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Brewery_Name City
## 2405 German Pilsener 12 Ukiah Brewing Company Ukiah
## 2406 Hefeweizen 12 Butternuts Beer and Ale Garrattsville
## 2407 American IPA 12 Butternuts Beer and Ale Garrattsville
## 2408 Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville
## 2410 English Pale Ale 12 Sleeping Lady Brewing Company Anchorage
## State
## 2405 CA
## 2406 NY
## 2407 NY
## 2408 NY
## 2409 NY
## 2410 AK
#Missing Values in ABV/IBU columns
table(is.na(Totaldata$ABV))# True value represents NA values in the column ABV in original data
##
## FALSE TRUE
## 2348 62
table(is.na(Totaldata$IBU))# True value represents NA values in the column IBU in original data
##
## FALSE TRUE
## 1405 1005
gg_miss_var(Totaldata)+ ggtitle("Missing data values by Category")
Graph and the values in table above provide the missing data values in ABV (65) and IBU (1005) columns from originally considered beer data.
#Median for ABV and IBU content for each state (final plot given in below chunk)
#Evaluating median of ABV and IBU (all NA values removed from data)
Median_ABV_IBU <- Totaldata %>% arrange(State) %>% group_by(State) %>%
summarize(Median_ABV = median(ABV, na.rm = TRUE), Median_IBU = median(IBU,na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
Median_ABV_IBU <- data.frame(Median_ABV_IBU)
#Bar plot for median of ABV content for each state (This plot is added to IBU med plot below)
ABV_Medplot <- Median_ABV_IBU %>% ggplot() +
geom_bar(aes(x = State, y = Median_ABV, fill = State),group = Median_ABV_IBU$State,
width = 0.3, stat = 'identity', show.legend = FALSE)
#Bar plot for median of IBU content for each state (added to median of ABV plot)
ABV_IBU_Medplot <- ABV_Medplot + geom_bar(aes(x = as.numeric(factor(State)) + 0.4,
y = Median_IBU/1000),stat = 'identity',width = 0.3)
#Median IBU scaled down by 100 in order to set same y axis scale for ABV and IBU.
#Adding secondary y axis to the plot
ABV_IBU_Medplot1 <- ABV_IBU_Medplot +
scale_y_continuous(limits= c(0,0.072), labels = percent, name = "Alcohol By Volume (ABV) [%]",
sec.axis = sec_axis(~.*1000,name = "International Bitterness Unit (IBU)")) +
ggtitle('Median ABV and IBU Content by State') + theme(
# AXIS LABLES APPEARANCE
plot.title = element_text(size=14, face= "bold", colour= "black" ),
axis.title.x = element_text(size=12, face="bold", colour = "black"),
axis.title.y = element_text(size=12, face="bold", colour = "black"),
axis.text.x = element_text(size=12, face="bold", colour = "black"),
axis.text.y = element_text(size=12, face="bold", colour = "black"),
strip.text.x = element_text(size = 10, face="bold", colour = "black" ),
strip.text.y = element_text(size = 10, face="bold", colour = "black"),
)
#Adding text to median values for ABV and IBU
ABV_IBU_Medplot2 <- ABV_IBU_Medplot1 + geom_text(aes(State, Median_ABV+0.0045,
label = paste("ABV =",percent(Median_ABV,
accuracy = 0.001)),
fill = NULL),
data = Median_ABV_IBU, angle = 90) +
geom_text(aes(as.numeric(factor(State)) + 0.4, (Median_IBU/1000)+0.0032,
label = paste("IBU =",Median_IBU), fill = NULL), data = Median_ABV_IBU, angle = 90)
ABV_IBU_Medplot2
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).
#plot best viewed in scaled full screen zoom 1920px x 1080px (21" or higher dim screen)
From the median ABV and median IBU plot above it can be observed that ‘District of Columbia’ and ‘Kentucky’ have the highest median ABV values shown in percentage. ‘Maine’ and ‘West Virginia’ have the highest median IBU values.
#collecting max data
Max_ABV <- Totaldata %>% arrange(State) %>% group_by(State) %>%
summarize(Max_ABV = max(ABV, na.rm=TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
Max_IBU <- Totaldata %>% arrange(State) %>% group_by(State) %>%
summarize(Max_IBU = max(IBU, na.rm=TRUE))
## Warning in max(IBU, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
## `summarise()` ungrouping output (override with `.groups` argument)
Max_ABV$Cat <- "Max_ABV"
Max_IBU$Cat <- "Max_IBU"
#Bar plot for max ABV content for each state
Maxplot_ABV_IBU <- Max_ABV %>% ggplot(mapping = aes(x,y)) +
geom_bar(aes(x = State, y = Max_ABV, fill = State), group = Max_ABV$State,
stat = 'identity', show.legend = FALSE) +
ggtitle('Max ABV Content by State') + geom_text(aes(State, Max_ABV+0.0085,
label = percent(Max_ABV, accuracy = 0.01),
fill = NULL), data = Max_ABV, angle = 90) +
geom_bar(data=Max_IBU, aes(x = State, y = Max_IBU, fill = State), stat = 'identity',
show.legend = FALSE) +
geom_text(aes(State, Max_IBU+5, label = Max_IBU, fill = NULL), data = Max_IBU) +
ggtitle('Max ABV and Max IBU Content by State') + facet_grid(Cat~., scale = "free_y",
labeller = label_parsed, switch = "y") + labs(x="State") +
theme(axis.title.y = element_blank(), strip.placement = "outside")
Maxplot_ABV_IBU
## Warning: Removed 1 rows containing missing values (geom_bar).
#plot best viewed in scaled full screen zoom 1920px x 1080px (21" or higher dim screen)
From the Max ABV and IBU plot it is observed that ‘Colorado’ has the max ABV value of 12.8% and ‘Oregon’ has the max IBU value of 138.
Stats <- Totaldata %>% summarize(Mean = mean(ABV, na.rm=TRUE),
Median = median(ABV,na.rm=T), Max = max(ABV,na.rm=T), Min = min(ABV,na.rm=T),
SD = sd(ABV,na.rm=T), N = n())
#Histogram and Density Plot
His_Den <- Totaldata %>% filter(!is.na(ABV)) %>% ggplot(aes(x=ABV)) +
geom_histogram(aes(y=..density..),colour='red',fill='blue', binwidth = 0.0035) +
geom_density(alpha=.4, fill='#FFFF00') +
ggtitle('ABV Statistical Attributes - Histogram, Density and Box Plots') +
scale_x_continuous(expand = c(0,0), limit = c(0,0.14)) + labs(y="Density / Count")
#Box plot for ABV data
Box <- Totaldata %>% filter(!is.na(ABV)) %>% ggplot(aes(x=ABV)) +
geom_boxplot(col='black',fill='#FF6666') + scale_x_continuous(expand = c(0,0), limit = c(0,0.14)) + scale_y_continuous(expand = c(0,0), limit = c(-0.4,0.4))
#Histogram density plot and Box plot on same scale
grid.draw(rbind(ggplotGrob(His_Den),
ggplotGrob(Box),
size = "first"))
## Warning: Removed 2 rows containing missing values (geom_bar).
Stats
## Mean Median Max Min SD N
## 1 0.05977342 0.056 0.128 0.001 0.01354173 2410
All statically significant attributes for the ABV values are shown above The distribution of ABV is slightly right skewed. ABV of beers around 5% has the highest count. There are total 2410 non-missing ABV values in this data set. The maximum ABV is 12.8%, the minimum ABV is .1%, the median ABV is 5.6%. The mean ABV is 5.98%, median 5.6% and standard deviation of ABV is 1.35%.
#Scatter plot with
Totaldata %>% filter(!is.na(ABV) & !is.na(IBU)) %>%
ggplot(aes(y=ABV, x=IBU)) + geom_point(position='jitter') + geom_smooth(method=loess)
## `geom_smooth()` using formula 'y ~ x'